1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 package org.smartcrawler;
28
29 import org.apache.commons.configuration.ConfigurationException;
30 import org.apache.log4j.Level;
31 import org.apache.log4j.Logger;
32 import org.smartcrawler.common.Context;
33 import org.smartcrawler.common.ConfigReader;
34 import org.smartcrawler.common.Link;
35 import org.smartcrawler.common.MalformedLinkException;
36 import org.smartcrawler.common.Provider;
37 import org.smartcrawler.common.ProviderFactory;
38 import org.smartcrawler.common.SCLogger;
39
40
41
42 /***
43 * The class which is responsible of starting the crawling processes.
44 *
45 * @author <a href="mailto:pozzad@alice.it">Davide Pozza</a>
46 * @version <tt>$Revision: 1.18 $</tt>
47 */
48 public class Crawler {
49
50 /*** Configuration parameters for the session */
51 private Context context;
52
53 /*** The object which stores and provides the links to process */
54 private Provider provider;
55
56 /***
57 *
58 * @param urlStr The starting url.
59 * @param configFileName The configuration file name to use for the crawling.
60 *
61 * @throws org.smartcrawler.common.MalformedLinkException
62 * @throws org.apache.commons.configuration.ConfigurationException
63 */
64 public Crawler(String urlStr, String configFileName)
65 throws MalformedLinkException, ConfigurationException {
66 Logger.getRootLogger().setLevel(Level.OFF);
67
68 ConfigReader confReader = new ConfigReader();
69 doInit(urlStr,
70 confReader.readConfig(configFileName));
71 }
72
73 /***
74 *
75 * @param urlStr
76 * @param context
77 * @throws org.smartcrawler.common.MalformedLinkException
78 * @throws org.apache.commons.configuration.ConfigurationException
79 */
80 public Crawler(String urlStr, Context context)
81 throws MalformedLinkException, ConfigurationException {
82 doInit(urlStr, context);
83 }
84
85 /***
86 *
87 * @param urlStr
88 * @param context
89 * @throws org.smartcrawler.common.MalformedLinkException
90 * @throws org.apache.commons.configuration.ConfigurationException
91 */
92 public Crawler(String urlStr)
93 throws MalformedLinkException, ConfigurationException {
94 Context defaultConf = new Context();
95 doInit(urlStr, defaultConf);
96 }
97
98 /***
99 *
100 * @param urlStr
101 * @param context
102 * @throws org.smartcrawler.common.MalformedLinkException
103 * @throws org.apache.commons.configuration.ConfigurationException
104 */
105 protected void doInit(String urlStr, Context context)
106 throws MalformedLinkException, ConfigurationException {
107
108 this.context = context;
109
110 SCLogger.initialize(context.getLoggers());
111
112 Link initial = new Link(urlStr);
113 this.context.setInitialLink(initial);
114
115 this.provider = ProviderFactory.instance().create();
116 provider.store(initial);
117
118
119 }
120
121 /***
122 * Starts the {@link org.smartcrawler.DownloadEngine} threads by using the
123 * configuration settings supplied by the
124 * {@link org.smartcrawler.common.SiteConfiguration}.
125 *
126 */
127 public void startEngines() {
128 int enginesThreadNum = 1;
129 enginesThreadNum = this.context.getEngineThreadNumber();
130 for (int i = 0; i < enginesThreadNum; i++) {
131 DownloadEngine eng = new DownloadEngine(context);
132 eng.setName(" [Engine-" + (i + 1) + "] ");
133 eng.start();
134 }
135
136
137
138
139
140 }
141
142 /***
143 * The main method
144 *
145 * @param args The command line arguments.
146 */
147 public static void main(String[] args) {
148 String urlStr = null;
149
150 String configFileName = "bin/conf/smartcrawler-config.xml";
151
152 try {
153 if (args.length > 0) {
154 urlStr = args[0];
155 } else {
156 System.out.println("Please specify a valid starting url.");
157 }
158 if (args.length > 1) {
159 configFileName = args[1];
160 } else {
161 String home = System.getProperty("smartcrawler.home");
162 String sep = System.getProperty("file.separator");
163 if (home != null) {
164 configFileName = home + sep + configFileName;
165 }
166 }
167 new Crawler(urlStr, configFileName).startEngines();
168
169 } catch (MalformedLinkException e) {
170 System.out.println("Invalid initial link! " + urlStr);
171 } catch (Exception e) {
172 System.out.println("Generic error: " + e.getMessage());
173 }
174 }
175 }